---
title: "RQ2: Artificially Attritioned Datasets"
format:
html:
embed-resources: true
fig-width: 8
fig-height: 8
code-link: true
code-fold: true
code-tools: true
df-print: paged
toc: true
grid:
body-width: 900px
editor: source
editor_options:
chunk_output_type: console
---
# Data cleaning
## Load Results
The script [ 2_rq2_1_run_bootstrapping.R ](2_rq2_1_run_bootstrapping.R) {target="_blank"} runs the bootstrap analyses and saves the results - which are then cleaned and presented below.
```{r}
#| output: false
source ("0_load_data.R" )
rm (rq1y, rq1y_labels, rq1y_labels_clean)
# Load new bootstrap results
boot_results = readRDS (file = file.path ("results" , "2_boot_results.Rds" ))
number_timepoints = length (rq1y_twin1)
number_bootstraps = length (boot_results)/ number_timepoints
# ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
# Post-processing of results ---------------------------------------------------
# ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
joblist = expand.grid (
b = 1 : number_bootstraps,
i = 1 : number_timepoints
)
### Extract data from boot_results ---------------------------------------------
md_df = do.call (rbind, lapply (seq_along (boot_results), function (j) {
result_vec = boot_results[[j]]$ md[[1 ]]
data.frame (
.dataset = joblist[j, "i" ],
.boot = joblist[j, "b" ],
t (setNames (result_vec, rq2y))
)
}))
smd_df = do.call (rbind, lapply (seq_along (boot_results), function (j) {
result_vec = boot_results[[j]]$ smd[[1 ]]
data.frame (
.dataset = joblist[j, "i" ],
.boot = joblist[j, "b" ],
t (setNames (result_vec, rq2y))
)
}))
var_df = do.call (rbind, lapply (seq_along (boot_results), function (j) {
result_vec = boot_results[[j]]$ var[[1 ]]
data.frame (
.dataset = joblist[j, "i" ],
.boot = joblist[j, "b" ],
t (setNames (result_vec, rq2y))
)
}))
cor_df = do.call (rbind, lapply (seq_along (boot_results), function (j) {
result_vec = boot_results[[j]]$ cor_resid[[1 ]]
data.frame (
.dataset = joblist[j, "i" ],
.boot = joblist[j, "b" ],
t (result_vec)
)
}))
srmr_df = do.call (rbind, lapply (seq_along (boot_results), function (j) {
data.frame (
.dataset = joblist[j, "i" ],
.boot = joblist[j, "b" ],
srmr = boot_results[[j]]$ srmr[[1 ]]
)
}))
ace_df = do.call (rbind, lapply (seq_along (boot_results), function (j) {
ace_est = boot_results[[j]]$ ace[[1 ]]$ ace_estimates
ace_diff = boot_results[[j]]$ ace[[1 ]]$ ace_differences
result_df = rbind (ace_est, ace_diff)
result_df$ .dataset = joblist[j, "i" ]
result_df$ .boot = joblist[j, "b" ]
return (result_df)
}))
rownames (md_df) = NULL
rownames (smd_df) = NULL
rownames (var_df) = NULL
rownames (cor_df) = NULL
rownames (srmr_df) = NULL
bootstrap_iter = list (md = md_df, smd = smd_df, var = var_df, cor_resid = cor_df, srmr = srmr_df)
### Calculate p-values and confidence intervals per dataset --------------------
bootstrap_summary_df = lapply (names (bootstrap_iter)[1 : 3 ], function (stat_name) {
df = bootstrap_iter[[stat_name]]
dataset = df %>%
pivot_longer (
cols = ! starts_with ("." )
) %>%
group_by (.dataset, name) %>%
summarise (
.mean_qi_pd (value),
.groups = "drop"
) %>%
mutate (
dataset = rq1y_twin_labels_clean_extrashort[.dataset],
variable = rq2y_labels_short[match (name, rq2y)],
stat = stat_name
) %>%
select (- .dataset, - name) %>%
relocate (dataset, variable, stat)
})
bootstrap_summary_df = do.call (rbind, bootstrap_summary_df)
rownames (bootstrap_summary_df) = NULL
### Calculate bootstrap summaries for ACE separately ---------------------------
bootstrap_summary_df_ace = ace_df %>%
filter (name != "amohqualn1" ) %>%
mutate (
group = recode (group,
"df1" = "Attritioned" ,
"df2" = "Original" ,
"diff" = "Difference"
),
dataset = rq1y_twin1[.dataset],
dataset_label = rq1y_twin_labels_clean_extrashort[.dataset]
) %>%
group_by (par, name, sex, group, dataset, dataset_label) %>%
summarise (
.mean_qi_pd (value),
.groups = "drop"
)
variable_comparisons_df = bootstrap_summary_df %>%
group_by (dataset, stat) %>%
mutate (
pval_adj = stats:: p.adjust (pval, method = "holm" )
)
```
## Create Attritioned Datasets
To remove participants, I've set the rows to NA rather than remove the rows, so all the datasets have the same length. This preserves the paired twin structure needed for bootstrapping and ACE analyses.
```{r}
attritioned_datasets = list ()
for (i in seq_along (rq1y_twin1)){
filter = as.numeric (df[[rq1y_twin1[i]]])== 0 # 1 = present (Y), 2 = not-present (N)
attritioned_datasets[[i]] = df %>%
select (all_of (rq2y))
attritioned_datasets[[i]][filter,] = NA
}
original_dataset = df %>%
select (all_of (rq2y))
```
# Comparison of Means, Standardized Mean Differences, and Variances
## Create GT Results table
```{r}
variable_comparisons_df %>%
select (- starts_with ("." ),- pd, - pval) %>%
pivot_wider (
names_from = "stat" ,
values_from = c ("y" ,"ymin" ,"ymax" ,"pval_adj" ),
id_cols = c ("variable" , "dataset" )
) %>%
dplyr:: rename ( Outcome = "variable" ) %>%
gt (groupname_col = "dataset" ) %>%
tab_options (
table.width = px (800 )
# table.border.top.style = "none",
# table.border.bottom.style = "none",
# column_labels.border.bottom.style = "solid",
# column_labels.border.bottom.width = px(1),
# column_labels.border.bottom.color = "black",
# table_body.border.bottom.style = "none",
# table_body.vlines.style = "none",
# row_group.border.bottom.style = "none"
) %>%
# Create column spanners (nested headers)
tab_spanner (
label = "Mean Difference" ,
columns = c (y_md, ymin_md, ymax_md, pval_adj_md)
) %>%
tab_spanner (
label = "Standardized Mean Difference" ,
columns = c (y_smd, ymin_smd, ymax_smd, pval_adj_smd)
) %>%
tab_spanner (
label = "Variance % Change" ,
columns = c (y_var, ymin_var, ymax_var, pval_adj_var)
) %>%
# Rename columns under each spanner
cols_label (
y_md = "Est" , ymin_md = "LB" , ymax_md = "UB" , pval_adj_md = "p" ,
y_smd = "Est" , ymin_smd = "LB" , ymax_smd = "UB" , pval_adj_smd = "p" ,
y_var = "Est" , ymin_var = "LB" , ymax_var = "UB" , pval_adj_var = "p"
) %>%
# Format numbers
fmt (
columns = ! contains ("var" ) & ! contains ("Outcome" ),
fns = function (x) {gbtoolbox:: apa_num (x, n_decimal_places = 3 )}
) %>%
fmt (
columns = "pval_adj_var" ,
fns = function (x) {gbtoolbox:: apa_num (x, n_decimal_places = 3 )}
) %>%
fmt_percent (
columns = c ("y_var" , "ymin_var" ,"ymax_var" ),
decimals = 2 ,
drop_trailing_zeros = FALSE ,
drop_trailing_dec_mark = FALSE
) %>%
# Styling - uniform font size
tab_style (
style = cell_text (size = px (10 )),
locations = cells_column_spanners ()
) %>%
tab_style (
style = cell_text (size = px (10 )),
locations = cells_body ()
) %>%
tab_style (
style = cell_text (size = px (10 ), align = "center" ),
locations = cells_row_groups ()
) %>%
tab_style (
style = cell_text (size = px (10 )),
locations = cells_column_labels ()
) %>%
tab_style (
style = cell_text (size = px (10 )),
locations = cells_footnotes ()
) %>%
# Highlight significant results - positive effects (light green)
tab_style (
style = cell_fill (color = "#d5e8d4" ), # Light green for positive effects
locations = cells_body (
columns = c (y_md, ymin_md, ymax_md, pval_adj_md),
rows = pval_adj_md < 0.05 & y_md > 0
)
) %>%
tab_style (
style = cell_fill (color = "#d5e8d4" ), # Light green for positive effects
locations = cells_body (
columns = c (y_smd, ymin_smd, ymax_smd, pval_adj_smd),
rows = pval_adj_smd < 0.05 & y_smd > 0
)
) %>%
tab_style (
style = cell_fill (color = "#d5e8d4" ), # Light green for positive effects
locations = cells_body (
columns = c (y_var, ymin_var, ymax_var, pval_adj_var),
rows = pval_adj_var < 0.05 & y_var > 0
)
) %>%
# Highlight significant results - negative effects (light red)
tab_style (
style = cell_fill (color = "#f8cecc" ), # Light red for negative effects
locations = cells_body (
columns = c (y_md, ymin_md, ymax_md, pval_adj_md),
rows = pval_adj_md < 0.05 & y_md < 0
)
) %>%
tab_style (
style = cell_fill (color = "#f8cecc" ), # Light red for negative effects
locations = cells_body (
columns = c (y_smd, ymin_smd, ymax_smd, pval_adj_smd),
rows = pval_adj_smd < 0.05 & y_smd < 0
)
) %>%
tab_style (
style = cell_fill (color = "#f8cecc" ), # Light red for negative effects
locations = cells_body (
columns = c (y_var, ymin_var, ymax_var, pval_adj_var),
rows = pval_adj_var < 0.05 & y_var < 0
)
) %>%
# Add footnotes
tab_footnote (
footnote = md ("<em>Note.</em> Est = Estimate, LB = Lower Bound 95% Confidence Interval, UB = Upper Bound 95% Confidence Interval. Significant (p<sub>Bonferroni-Holm</sub>) effects are highlighted in green (increases) or red (decreases)." ),
# locations = cells_column_spanners(),
placement = "right"
) %>%
tab_footnote (
footnote = "P values are Bonferroni-Holm adjusted within each participation group" ,
locations = cells_column_labels (columns = contains ("pval_adj" )),
placement = "right"
) %>%
tab_footnote (
footnote = md ("Standardized mean difference = (Mean<sub>original</sub> - Mean<sub>attritioned</sub>) / SD<sub>original</sub>" ),
locations = cells_column_spanners (spanners = "Standardized Mean Difference" ),
placement = "right"
) %>%
tab_footnote (
footnote = md ("Percentage change = (Var<sub>attritioned</sub> - Var<sub>original</sub>) / Var<sub>original</sub>" ),
locations = cells_column_spanners (spanners = "Variance % Change" ),
placement = "right"
) %>%
opt_footnote_marks (marks = c ("*" , "†" ,"‡" ))
```
## Create circular heatmaps for changes in means and variances
### Standardised Mean Differences
```{r}
# Create circular heatmap for SMD
heatmap_data_smd = variable_comparisons_df %>%
filter (stat == "smd" ) %>%
mutate (
outcome_labeled = dataset,
outcome_labeled = factor (outcome_labeled, levels = rq1y_twin_labels_clean_extrashort),
Variables_wrapped = str_wrap (variable, width = 8 ),
# Set fill value to NA if not significant, otherwise keep original sign
fill_value = case_when (
pval_adj < 0.05 ~ y,
TRUE ~ NA
)
) %>%
# Order variables by number of significant effects
group_by (variable) %>%
mutate (n_significant = sum (pval_adj < 0.05 )) %>%
ungroup () %>%
mutate (
Variables_wrapped = factor (Variables_wrapped,
levels = unique (Variables_wrapped[order (- n_significant)]))
)
ggplot (heatmap_data_smd, aes (x = Variables_wrapped, y = outcome_labeled)) +
geom_tile (aes (fill = fill_value,
color = ifelse (variable %in% c ("Emotional problems" , "Prosocial behavior" ), "azure2" , "black" )),
size = 0.5 ) +
scale_color_identity () +
geom_text (
aes (label = ifelse (pval_adj < 0.05 ,
gsub ("^(-?)0 \\ ." , " \\ 1." , sprintf ("%.3f" , y)), "" ),
size = ifelse (outcome_labeled == "Y4" , 2 , 3.5 )
),
color = "black"
) +
scale_size_identity () +
coord_polar (theta = "x" , start = 0 , direction = 1 , clip = "off" ) +
scale_fill_gradient2 (
low = "#2166ac" ,
mid = "white" ,
high = "#d73027" ,
midpoint = 0 ,
# limits = c(-.1,
# max(heatmap_data_smd$fill_value, na.rm = TRUE)),
na.value = "white" ,
guide = "none"
) +
labs (
title = "Standardised Mean Difference" ,
subtitle = expression (frac (bar (X)[attritioned] - bar (X)[original], SD[original])),
tag = "A"
) +
theme_minimal () +
theme (
axis.text.x = element_text (angle = 0 , size = 11 , colour = "black" ),
axis.text.y = element_blank (),
axis.title = element_blank (),
panel.grid = element_blank (),
panel.background = element_blank (),
plot.background = element_blank (),
plot.title = element_text (hjust = 0.5 , size = 16 ),
plot.subtitle = element_text (hjust = 0.5 , size = 13.5 , margin = margin (b = - 20 )),
plot.tag = element_text (hjust = 0 , vjust = 0 , size = 30 , face = "bold" ),
plot.tag.position = "topleft" ,
plot.margin = margin (2 , 10 , - 33 , 10 )
) +
{
y_positions = seq_along (levels (heatmap_data_smd$ outcome_labeled))
labels = levels (heatmap_data_smd$ outcome_labeled)
lapply (seq_along (labels), function (i) {
annotate ("text" , x = 0.5 , y = y_positions[i],
label = paste0 (labels[i], " " ),
size = 3.7 ,
hjust = 1 , angle = 7 )
})
}
save_plot ("2_rq2_circular_heatmap_smd" , width = 8 , height = 8 )
```
### Variance Differences
```{r}
# Create circular heatmap for variance differences
heatmap_data_var = variable_comparisons_df %>%
filter (stat == "var" ) %>%
mutate (
outcome_labeled = dataset,
outcome_labeled = factor (outcome_labeled, levels = rq1y_twin_labels_clean_extrashort),
Variables_wrapped = str_wrap (variable, width = 8 ),
# Set fill value to NA if not significant, otherwise keep original sign
fill_value = case_when (
pval_adj < 0.05 ~ y,
TRUE ~ NA
)
) %>%
# Order variables by number of significant effects
group_by (variable) %>%
mutate (n_significant = sum (pval_adj < 0.05 )) %>%
ungroup () %>%
mutate (
Variables_wrapped = factor (Variables_wrapped,
levels = unique (Variables_wrapped[order (- n_significant)]))
)
ggplot (heatmap_data_var, aes (x = Variables_wrapped, y = outcome_labeled)) +
geom_tile (aes (fill = fill_value,
color = ifelse (variable %in% c ("Emotional problems" , "Prosocial behavior" , "Hyperactivity" , "Grammar" ), "azure2" , "black" )),
size = 0.5 ) +
scale_color_identity () +
geom_text (
aes (label = ifelse (pval_adj < 0.05 ,
paste0 (sprintf ("%.1f" , y * 100 ), "%" ), "" ),
size = ifelse (outcome_labeled == "Y4" , 2 , 3.5 )
),
color = "black"
) +
scale_size_identity () +
coord_polar (theta = "x" , start = 0 , direction = 1 , clip = "off" ) +
scale_fill_gradient2 (
low = "#2166ac" ,
mid = "white" ,
high = "#d73027" ,
midpoint = 0 ,
limits = c (- max (abs (heatmap_data_var$ fill_value), na.rm = TRUE ),
max (abs (heatmap_data_var$ fill_value), na.rm = TRUE )),
na.value = "white" ,
guide = "none"
) +
labs (
title = "Variance Percentage Change" ,
subtitle = expression (frac (Var[attritioned] - Var[original], Var[original]) %*% 100 ),
tag = "B"
) +
theme_minimal () +
theme (
axis.text.x = element_text (angle = 0 , size = 11 , colour = "black" ),
axis.text.y = element_blank (),
axis.title = element_blank (),
panel.grid = element_blank (),
panel.background = element_blank (),
plot.background = element_blank (),
plot.title = element_text (hjust = 0.5 , size = 16 ),
plot.subtitle = element_text (hjust = 0.5 , size = 13.5 , margin = margin (b = - 20 )),
plot.tag = element_text (hjust = 0 , vjust = 0 , size = 30 , face = "bold" ),
plot.tag.position = "topleft" ,
plot.margin = margin (2 , 10 , - 33 , 10 )
) +
{
y_positions = seq_along (levels (heatmap_data_var$ outcome_labeled))
labels = levels (heatmap_data_var$ outcome_labeled)
lapply (seq_along (labels), function (i) {
annotate ("text" , x = 0.5 , y = y_positions[i],
label = paste0 (labels[i], " " ), size = 3.7 , hjust = 1 , angle = 7 )
})
}
save_plot ("2_rq2_circular_heatmap_var" , width = 8 , height = 8 )
```
## Plot data distributions
```{r}
attritioned_datasets_long = attritioned_datasets
for (i in seq_along (attritioned_datasets_long)){
attritioned_datasets_long[[i]] = gather (as.data.frame (as.matrix (attritioned_datasets[[i]])))
attritioned_datasets_long[[i]]$ dataset = rq1y_twin1[i]
}
x = original_dataset %>%
as.matrix () %>% as.data.frame () %>%
gather () %>%
mutate (dataset = "original" )
attritioned_datasets_long = c (attritioned_datasets_long, list (x))
attritioned_datasets_long = do.call (rbind.data.frame,attritioned_datasets_long)
attritioned_datasets_long = attritioned_datasets_long %>%
filter (! is.na (value)) %>%
mutate (
key = rq2y_labels_short[match (key, rq2y)]
)
attritioned_datasets_long %>%
mutate (dataset = factor (dataset)) %>%
ggplot (aes (x = value, group = dataset, col = dataset)) +
geom_density () +
facet_wrap (~ key, scales = "free" )
```
### Alternative: Frequency polygon (better for ordinal/integer data)
Only comparing a single attrition timepoint (Y26 Questionnaire).
```{r}
#| fig-width: 12
#| fig-height: 10
attritioned_datasets_long %>%
filter (! (key %in% c ("Parent-admin cognition" ,"Vocabulary" ))) %>%
filter (dataset %in% c ("original" , "zmhdata1" )) %>%
mutate (
dataset = factor (dataset),
dataset = ifelse (dataset == "original" , "original" , "attritioned" ),
value = as.numeric (value)
) %>%
count (key, value, dataset) %>%
group_by (key, dataset) %>%
mutate (prop = n / sum (n)) %>%
ggplot (aes (x = value, y = prop, col = dataset, group = dataset)) +
geom_line (linewidth = 1 ) +
geom_point (size = 2 ) +
scale_color_manual (values = c ("original" = "steelblue" , "attritioned" = "coral" )) +
facet_wrap (~ key, scales = "free" ) +
labs (
x = "Value" ,
y = "Proportion" ,
title = "Distribution comparison: Original vs Participants at Y26 Questionnaire"
) +
theme_minimal () +
theme (legend.position = "bottom" )
```
# Correlations
The following code creates `x_var` and `y_var` vectors that map correlation column indices (X1, X2, etc.) to variable pairs from the lower triangle of the correlation matrix.
```{r}
test_correlation_matrix = matrix (
nrow = length (rq2y),
ncol = length (rq2y)
)
for (i in seq_along (rq2y)){
for (j in seq_along (rq2y)){
test_correlation_matrix[i,j] = paste (rq2y[i], rq2y[j], collapse = " " )
}
}
x = test_correlation_matrix[lower.tri (test_correlation_matrix)]
x_var = str_extract (x, "^ \\ S+" )
y_var = str_extract (x, " \\ S+$" )
rm (test_correlation_matrix,x)
```
## Plot changes in correlations
```{r}
# Get original column order before pivot
cor_col_order = setdiff (names (cor_df), c (".dataset" , ".boot" ))
correlation_comparisons_df = cor_df %>%
pivot_longer (
cols = ! starts_with ("." )
) %>%
mutate (
name = factor (name, levels = cor_col_order)
) %>%
group_by (.dataset, name) %>%
summarise (
.mean_qi_pd (value),
.groups = "drop"
) %>%
group_by (.dataset) %>%
mutate (
pval_adj = stats:: p.adjust (pval, method = "holm" )
) %>%
ungroup () %>%
mutate (
x_var = x_var[match (name, cor_col_order)],
y_var = y_var[match (name, cor_col_order)]
) %>%
arrange (.dataset, name)
correlation_comparisons = split (correlation_comparisons_df, correlation_comparisons_df$ .dataset)
correlation_comparisons = lapply (correlation_comparisons, function (x) select (x, - .dataset))
names (correlation_comparisons) = rq1y_twin1[as.numeric (names (correlation_comparisons))]
library (patchwork)
plots = lapply (1 : length (correlation_comparisons), function (i) {
correlation_comparisons[[i]] %>%
plot_lower_triangular_matrix2 (
variables = rq2y,
labels = rq2y_labels_short,
title = rq1y_twin_labels_clean_extrashort[i],
caption = NULL ,
p_col = "pval_adj" ,
method = "none"
)
})
patchwork:: wrap_plots (
plots,
ncol = 3 ) +
plot_annotation (
title = "Change in Correlation" ,
subtitle = expression (Cor[attritioned] ~ - ~ Cor[original]),
theme = theme (
plot.title = element_text (hjust = 0.5 , size = 16 , face = "bold" ),
plot.subtitle = element_text (hjust = 0.5 , size = 16 , face = "bold" )
)
)
save_plot (
"2_rq2_2_correlations" , width = 18 , height = 18 )
```
[ 📊 View Full Resolution Correlation Plots (PNG) ](plots/pngs/2_rq2_2_correlations.png)
## Correlations results table
Below is a results table for the change in correlations.
```{r}
cor_table = bind_rows (correlation_comparisons, .id = "timepoint" ) %>%
mutate (
dataset = rq1y_twin_labels_clean_extrashort[match (timepoint, rq1y_twin1)],
x_var_label = rq2y_labels_short[match (x_var, rq2y)],
y_var_label = rq2y_labels_short[match (y_var, rq2y)],
pair = paste (x_var_label, "×" , y_var_label)
) %>%
select (dataset, pair, y, ymin, ymax, pval, pval_adj) %>%
filter (! is.na (dataset)) %>%
gt (groupname_col = "dataset" ) %>%
# tab_options(
# table.width = px(800)
# ) %>%
# Rename columns
cols_label (
pair = "Variable Pair" ,
y = "Est" ,
ymin = "LB" ,
ymax = "UB" ,
pval = md ("p<sub>unadj</sub>" ),
pval_adj = md ("p<sub>adj</sub>" )
) %>%
# Create column spanner
tab_spanner (
label = md ("Correlation difference = Cor<sub>attritioned</sub> - Cor<sub>original</sub>" ),
columns = c (y, ymin, ymax, pval, pval_adj)
) %>%
# Format numbers
fmt (
columns = c (y, ymin, ymax),
fns = function (x) {gbtoolbox:: apa_num (x, n_decimal_places = 3 )}
) %>%
fmt (
columns = c (pval, pval_adj),
fns = function (x) {gbtoolbox:: apa_num (x, n_decimal_places = 3 )}
) %>%
# Styling - uniform font size
tab_style (
style = cell_text (size = px (10 )),
locations = cells_column_spanners ()
) %>%
tab_style (
style = cell_text (size = px (10 )),
locations = cells_body ()
) %>%
tab_style (
style = cell_text (size = px (10 ), align = "center" ),
locations = cells_row_groups ()
) %>%
tab_style (
style = cell_text (size = px (10 )),
locations = cells_column_labels ()
) %>%
tab_style (
style = cell_text (size = px (10 )),
locations = cells_footnotes ()
) %>%
# Highlight significant results - positive effects (light green)
tab_style (
style = cell_fill (color = "#d5e8d4" ),
locations = cells_body (
columns = c (pair, y, ymin, ymax, pval, pval_adj),
rows = pval_adj < 0.05 & y > 0
)
) %>%
# Highlight significant results - negative effects (light red)
tab_style (
style = cell_fill (color = "#f8cecc" ),
locations = cells_body (
columns = c (pair, y, ymin, ymax, pval, pval_adj),
rows = pval_adj < 0.05 & y < 0
)
) %>%
# Add footnotes
tab_footnote (
footnote = md ("<em>Note.</em> Est = Estimate, LB = Lower Bound 95% Confidence Interval, UB = Upper Bound 95% Confidence Interval. Significant (p<sub>Bonferroni-Holm</sub>) effects are highlighted in green (increases) or red (decreases)." ),
placement = "right"
) %>%
tab_footnote (
footnote = "P values are Bonferroni-Holm adjusted within each attrition timepoint" ,
locations = cells_column_labels (columns = pval_adj),
placement = "right"
) %>%
opt_footnote_marks (marks = c ("*" , "†" , "‡" ))
cor_table
```
## Plot Original Correlations between variables
```{r}
df %>%
filter (! (randomfamid %in% exclude_fams_onesib)) %>%
select (rq2y) %>%
data.frame () %>%
` colnames<- ` (rq2y_labels_short) %>%
gbtoolbox:: plot_correlations () +
labs (title = "Original (un-attritioned) correlations" )
save_plot (
"2_rq2_2_correlations_original" , width = 8 , height = 8 )
```
[ 📊 View Plot ](plots/pngs/2_rq2_2_correlations_original.png)
# ACE Comparison
## Count of significant ACE differences (unadjusted)
```{r}
bootstrap_summary_df_ace %>%
filter (group == "Difference" ) %>%
summarise (
total_tests = dplyr:: n (),
sig_unadj = sum (pval < 0.05 ),
pct_sig = round (sig_unadj / total_tests * 100 , 1 )
)
```
## ACE differences tables (p(unadjusted) < .05)
```{r}
bootstrap_summary_df_ace %>%
filter (group == "Difference" ) %>%
group_by (dataset_label) %>%
mutate (
pval_adj = stats:: p.adjust (pval, method = "holm" ),
n_adj = dplyr:: n (),
name = rq2y_labels_short[match (name, rq2y)]
) %>%
ungroup () %>%
arrange (name, par, dataset_label) %>%
filter (pval < .05 ) %>%
select (- starts_with ("." ), - dataset, - group, - n, - pd, - n_adj) %>%
mutate (
par = toupper (par),
sex = str_to_title (sex),
row_num = row_number ()
) %>%
relocate (row_num, dataset_label, name, par, sex, y, ymin, ymax, pval, pval_adj) %>%
gt () %>%
cols_label (
row_num = "" ,
dataset_label = "Timepoint" ,
name = "Variable" ,
par = "Par" ,
sex = "Sex" ,
y = "Est" ,
ymin = "LB" ,
ymax = "UB" ,
pval = md ("p<sub>unadj</sub>" ),
pval_adj = md ("p<sub>adj</sub>" )
) %>%
tab_spanner (
label = "ACE Difference (Attritioned - Original)" ,
columns = c (y, ymin, ymax, pval, pval_adj)
) %>%
fmt (
columns = c (y, ymin, ymax, pval, pval_adj),
fns = function (x) {gbtoolbox:: apa_num (x, n_decimal_places = 3 )}
) %>%
tab_style (
style = cell_text (size = px (10 )),
locations = cells_column_spanners ()
) %>%
tab_style (
style = cell_text (size = px (10 )),
locations = cells_body ()
) %>%
tab_style (
style = cell_text (size = px (10 )),
locations = cells_column_labels ()
) %>%
tab_style (
style = cell_text (size = px (10 )),
locations = cells_footnotes ()
) %>%
tab_footnote (
footnote = md ("<em>Note.</em> Est = Estimate, LB = Lower Bound 95% CI, UB = Upper Bound 95% CI, Par = ACE Parameter. Only effects with p<sub>unadjusted</sub> < .05 are shown." ),
placement = "right"
) %>%
tab_footnote (
footnote = "P values are Bonferroni-Holm adjusted within each sex, parameter, and timepoint" ,
locations = cells_column_labels (columns = pval_adj),
placement = "right"
) %>%
opt_footnote_marks (marks = c ("*" , "†" ))
```
## ACE Estimates & Differences Table (complete)
```{r}
bootstrap_summary_df_ace %>%
filter (par %in% c ("a" ,"c" ,"e" )) %>%
group_by (dataset_label) %>%
mutate (
pval = stats:: p.adjust (pval, method = "holm" ),
) %>%
ungroup () %>%
select (- starts_with ("." ), - n, - pd) %>%
mutate (name_clean = rq2y_labels_short[match (name, rq2y)]) %>%
pivot_wider (
id_cols = c (par, name_clean, sex, dataset_label),
names_from = group,
values_from = c (y, ymin, ymax, pval),
names_sep = "_"
) %>%
select (- ymin_Original, - ymax_Original, - ymin_Attritioned, - ymax_Attritioned, - pval_Original, - pval_Attritioned) %>%
select (dataset_label, name_clean, par, y_Original, y_Attritioned, y_Difference, ymin_Difference, ymax_Difference, pval_Difference, sex) %>%
mutate (
name_clean = factor (name_clean, levels = rq2y_labels_short),
dataset_label = factor (dataset_label, levels = rq1y_twin_labels_clean_extrashort)
) %>%
arrange (sex, dataset_label, name_clean, par) %>%
mutate (par = toupper (par)) %>%
gt (groupname_col = "sex" ) %>%
fmt_number (decimals = 3 ) %>%
tab_row_group (
label = "Male" ,
rows = sex == "male"
) %>%
tab_row_group (
label = "Female" ,
rows = sex == "female"
) %>%
cols_hide (sex) %>%
cols_label (
dataset_label = "Timepoint" ,
name_clean = "Variable" ,
par = "" ,
y_Original = "Original" ,
y_Attritioned = "Attritioned" ,
y_Difference = "Diff" ,
ymin_Difference = "Lower" ,
ymax_Difference = "Upper" ,
pval_Difference = "p"
) %>%
tab_spanner (
label = "95% CI" ,
columns = c (ymin_Difference, ymax_Difference)
) %>%
tab_spanner (
label = "Estimates" ,
columns = c (y_Attritioned, y_Original)
) %>%
tab_style (
style = cell_fill (color = "lightgreen" ),
locations = cells_body (
columns = everything (),
rows = pval_Difference < 0.05
)
) %>%
tab_footnote (
footnote = "P values are Bonferroni-Holm adjusted within each sex, parameter (A, C or E), and timepoint group" ,
locations = cells_column_labels (columns = pval_Difference)
)
```
## ACE Stacked Bar Plot
```{r}
bootstrap_summary_df_ace %>%
# filter(dataset_label == "Y4") %>%
filter (par %in% c ("a" ,"c" ,"e" )) %>%
filter (group != "Difference" ) %>%
mutate (
dataset_label = factor (dataset_label, levels = rq1y_twin_labels_clean_extrashort),
name = factor (rq2y_labels_short[match (name, rq2y)], levels = rq2y_labels_short),
group = ifelse (group == "Attritioned" , "A" , "O" ),
group = factor (group, levels = c ("A" , "O" )),
sex = str_to_title (sex),
par = toupper (par)
) %>%
ggplot (aes (x = group, y = y, fill = par)) +
geom_col (position = "stack" , alpha = 1 ) +
facet_grid (dataset_label + sex ~ name, switch = "x" ) +
theme_minimal () +
theme (
axis.text.x = element_text (angle = 0 , hjust = .5 , size = 8 ),
strip.text = element_text (size = 8 ),
strip.text.x = element_text (angle = 90 , hjust = 1 , vjust = .5 , size = 8 ),
strip.text.y = element_text (angle = 0 , size = 10 ),
strip.placement = "outside" ,
panel.grid = element_blank (),
plot.tag = element_text (hjust = 0 , vjust = 0 , size = 24 , face = "bold" ),
plot.tag.position = "topleft"
) +
labs (
title = "Attritioned (A) vs Original (O) ACE Estimates" ,
y = "Proportion" ,
x = NULL ,
fill = "Component"
) +
scale_fill_manual (values = c ("A" = "#d73027" , "C" = "#fee08b" , "E" = "#4575b4" )) +
coord_cartesian (ylim = c (0 , 1 ))
save_plot ("2_rq2_ace_comparison" , width = 7 , height = 15 , trim = TRUE )
# Create a looped version where we have plots for each timepoint
ace_plots = lapply (rq1y_twin_labels_clean_extrashort, function (timepoint) {
bootstrap_summary_df_ace %>%
filter (dataset_label == timepoint) %>%
filter (par %in% c ("a" ,"c" ,"e" )) %>%
filter (group != "Difference" ) %>%
mutate (
name = factor (rq2y_labels_short[match (name, rq2y)], levels = rq2y_labels_short),
group = ifelse (group == "Attritioned" , "A" , "O" ),
group = factor (group, levels = c ("A" , "O" )),
sex = str_to_title (sex),
par = toupper (par)
) %>%
ggplot (aes (x = group, y = y, fill = par)) +
geom_col (position = "stack" , alpha = 1 ) +
facet_grid (sex ~ name, switch = if (timepoint == "Y4" ) "both" else "x" ) +
theme_minimal () +
theme (
axis.text.x = element_text (angle = 0 , hjust = .5 , size = 8 ),
axis.text.y = if (timepoint != "Y4" ) element_blank (),
strip.text = element_text (size = 8 ),
strip.text.x = element_text (angle = 90 , hjust = 1 , vjust = .5 , size = 8 ),
strip.text.y = element_blank (),
strip.text.y.left = if (timepoint == "Y4" ) element_text (angle = 0 , size = 10 ) else element_blank (),
strip.placement = "outside" ,
panel.grid = element_blank ()
) +
labs (
title = timepoint,
y = if (timepoint == "Y4" ) "Proportion" else NULL ,
x = NULL ,
fill = "Component"
) +
scale_fill_manual (values = c ("A" = "#d73027" , "C" = "#fee08b" , "E" = "#4575b4" )) +
coord_cartesian (ylim = c (0 , 1 ))
})
names (ace_plots) = rq1y_twin_labels_clean_extrashort
# Arrange all plots vertically with shared legend
patchwork:: wrap_plots (ace_plots, nrow = 1 ) +
patchwork:: plot_layout (guides = "collect" ) +
patchwork:: plot_annotation (
title = "Attritioned (A) vs Original (O) ACE Estimates by Timepoint"
)
save_plot ("2_rq2_ace_comparison_by_timepoint" , width = 21 , height = 9 , trim = FALSE )
```
[ 📊 View plot 1 ](plots/pngs/2_rq2_ace_comparison.png)
[ 📊 View plot 2 ](plots/pngs/2_rq2_ace_comparison_by_timepoint.png)
# Supplementary Analyses
## How predictive are our variables of later attrition?
### Correlations
```{r}
sapply (rq2y, function (x)
sapply (rq1y_twin1, function (y)
cor (df[[x]],df[[y]], use = "pairwise.complete.obs" )
)) %>%
` colnames<- ` (rq2y_labels_short) %>%
` rownames<- ` (rq1y_twin_labels_clean_extrashort) %>%
knitr:: kable (digits = 3 )
```
### Regression Modelling
```{r}
models = list ()
for (i in seq_along (rq1y_twin1)){
cat (i, "/" , length (rq1y_twin1)," \n " )
formula <- as.formula (paste (rq1y_twin1[i], "~" , paste (rq2y, collapse = "+" )))
models[[i]] = glm (formula, data = df, family = binomial, na.action = "na.omit" )
}
# glm_model_comparison(models[[1]])
#
# lapply(models, function(x) performance::r2(x))
# lapply(models, function(x) calc_auc(x))
# lapply(models, function(x) summary(x))
# calc_auc(models[[1]])
```
#### Plot of AUC and R2 for each timepoint
```{r}
# Extract AUC and pseudo R2 for each model
model_metrics = data.frame (
outcome = sapply (models, function (x) as.character (x$ formula)[2 ]),
AUC = sapply (models, calc_auc),
R2 = sapply (models, function (x) performance:: r2 (x)$ R2)
) %>%
mutate (outcome = rq1y_twin_labels_clean[match (.$ outcome, rq1y_twin1)])
# First, order the outcomes chronologically by extracting year numbers
model_metrics_ordered = model_metrics %>%
mutate (
year_num = as.numeric (str_extract (outcome, " \\ d+" )),
outcome = fct_reorder (outcome, year_num)
)
plot_data = model_metrics_ordered %>%
pivot_longer (cols = c (AUC, R2)) %>%
mutate (
# Format labels
label_text = ifelse (name == "R2" ,
paste0 (sprintf ("%.1f" , value * 100 ), "%" ),
sprintf ("%.3f" , value)),
label_text = ifelse (name == "AUC" ,
gbtoolbox:: apa_num (value, n_decimal_places = 3 ),
label_text),
# Adjust bar start position for AUC (start from 0.5 instead of 0)
value_start = ifelse (name == "AUC" , 0.5 , 0 ),
value_width = value - value_start
)
plot_data %>%
ggplot (aes (y = outcome)) +
geom_rect (aes (xmin = value_start, xmax = value, ymin = as.numeric (outcome) - 0.4 , ymax = as.numeric (outcome) + 0.4 ),
fill = "steelblue" , alpha = 0.7 ) +
geom_text (aes (x = value, label = label_text), hjust = - 0.1 , size = 3 ) +
facet_wrap (~ name, scales = "free" ,
labeller = labeller (name = c ("AUC" = "AUC" , "R2" = "Tjur's R²" ))) +
scale_x_continuous (
labels = function (x) {
# Check if we're in the R2 facet by looking at the range
if (max (x, na.rm = TRUE ) < 0.2 ) { # R2 values are typically small
paste0 (round (x * 100 , 1 ), "%" )
} else {
sprintf ("%.2f" , x)
}
},
expand = expansion (mult = c (0 , 0.15 )),
limits = function (x) {
# Set minimum limit to 0.5 for AUC facet (values > 0.2)
if (max (x, na.rm = TRUE ) > 0.2 ) {
c (0.5 , max (x, na.rm = TRUE ) * 1.05 )
} else {
c (0 , max (x, na.rm = TRUE ) * 1.05 )
}
}
) +
labs (
title = "Prediction Accuracy Across Study Timepoints" ,
x = "Value" ,
y = "Study Timepoint"
) +
theme_minimal () +
theme (
strip.text = element_text (face = "bold" ),
plot.title = element_text (hjust = 0.5 )
)
save_plot ("2_prediction_accuracy" , width = 9 , height = 6 )
```
### Full model results
```{r}
# Extract all coefficients and combine
models_results = do.call (rbind, lapply (1 : length (models), function (i) {
coefs = tidy (models[[i]])
coefs$ outcome = as.character (models[[i]]$ formula)[2 ]
return (coefs)
}))
models_results %>%
select (outcome, everything ()) %>%
mutate (
outcome = var_to_label (outcome) %>% str_remove ("data.*" ),
term = df_labels[match (.$ term,df_colnames)],
term = ifelse (is.na (term), "Intercept" , term),
p_star = as.character (symnum (p.value, cutpoints = c (0 , 0.001 , 0.01 , 0.05 , 0.1 , 1 ), symbols = c ("***" , "**" , "*" , "." , " " )))
) %>%
knitr:: kable (., digits = 3 )
```
### Variable Importance Calculations
Note: Variable importance is calculated by measuring the change in AUC when each variable is removed from the model.
```{r}
glm_model_comparison_results = lapply (models, glm_model_comparison)
glm_model_comparison_results_df = do.call ("rbind.data.frame" , glm_model_comparison_results)
# glm_model_comparison_results_df$LRT_p_star = glm_model_comparison_results_df$LRT_p %>%
# symnum(., cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1), symbols = c("***", "**", "*", ".", " "))
# Adjust p-values and add star
glm_model_comparison_results_df = glm_model_comparison_results_df %>%
filter (Variables_Dropped!= "None" ) %>%
group_by (outcome) %>%
mutate (LRT_p = stats:: p.adjust (LRT_p, method = "holm" )) %>%
ungroup () %>%
mutate (
LRT_p_star = as.character (symnum (LRT_p, cutpoints = c (0 , 0.001 , 0.01 , 0.05 , 0.1 , 1 ), symbols = c ("***" , "**" , "*" , "." , " " )))
)
variable_importance_order = glm_model_comparison_results_df %>%
group_by (Variables_full) %>%
summarise (mean_score = mean (LRT_p)) %>%
arrange (mean_score) %>%
pull (Variables_full)
glm_model_comparison_results_df = glm_model_comparison_results_df %>%
mutate (
Variables_full = factor (Variables_full, levels = variable_importance_order)
)
# Plotting code
glm_model_comparison_results_df %>%
mutate (
Variables_full = rq2y_labels_short[match (.$ Variables_full, rq2y_labels)],
outcome_labeled = rq1y_twin_labels_clean_extrashort[match (.$ outcome, rq1y_twin1)],
criterion = Delta_AUC,
sig_level = case_when (
LRT_p < 0.001 ~ "p < 0.001" ,
LRT_p < 0.01 ~ "p < 0.01" ,
LRT_p < 0.05 ~ "p < 0.05" ,
LRT_p < 0.1 ~ "p < 0.1" ,
TRUE ~ "Not significant"
),
sig_level = factor (sig_level, levels = c ("p < 0.001" , "p < 0.01" , "p < 0.05" , "p < 0.1" , "Not significant" ))
) %>%
ggplot (aes (y = Variables_full, x = criterion)) +
geom_col (aes (fill = sig_level), alpha = 0.8 ) +
geom_text (aes (label = gsub ("0 \\ ." , "." , sprintf ("%.3f" , criterion))), # Remove ALL 0. patterns
size = 2.5 , fontface = "bold" , color = "black" ) +
facet_wrap (~ outcome_labeled, scales = "fixed" , ncol = 6 ) +
scale_x_continuous (labels = function (x) gsub ("0 \\ ." , "." , sprintf ("%.3f" , x))) + # Remove ALL 0. patterns
scale_fill_manual (
values = c ("p < 0.001" = "#d73027" , "p < 0.01" = "#fc8d59" ,
"p < 0.05" = "#fee08b" , "p < 0.1" = "#e0f3f8" ,
"Not significant" = "#d9d9d9" ),
name = "Significance \n (Holm-adjusted)"
) +
labs (
x = "Change in AUC (when variable removed)" ,
y = NULL ,
title = "Variable Importance by AUC Change" ,
subtitle = "Colors show Holm-adjusted significance levels"
) +
theme_minimal () +
theme (
axis.text.y = element_text (size = 7 ),
axis.text.x = element_text (angle = 90 , hjust = 1 , vjust = 0.5 ),
strip.text = element_text (size = 9 , face = "bold" ),
legend.position = "bottom"
)
save_plot ("2_rq2_variable_importance" , width = 14 , height = 7 )
```
## Comparison of maternal education scores
```{r}
df %>%
# group_by(zcdata1) %>%
summarise (
mean = mean (amohqualn, na.rm = TRUE ),
var = stats:: var (amohqualn1, na.rm = TRUE ),
sd = stats:: sd (amohqualn1, na.rm = TRUE )
)
df %>%
group_by (zcdata1) %>%
summarise (
mean = mean (amohqualn, na.rm = TRUE ),
var = stats:: var (amohqualn1, na.rm = TRUE ),
sd = stats:: sd (amohqualn1, na.rm = TRUE )
)
```